#setwd('/afs/inf.ed.ac.uk/user/s17/s1725186/Documents/PhD-Models/FirstPUModel/RMarkdowns')

library(tidyverse) ; library(reshape2) ; library(glue) ; library(plotly) ; library(dendextend)
library(RColorBrewer) ; library(viridis) ; require(gridExtra) ; library(GGally)
library(expss)
library(polycor)
library(foreach) ; library(doParallel)
library(knitr)
library(biomaRt)
library(anRichment) ; library(BrainDiseaseCollection)
suppressWarnings(suppressMessages(library(WGCNA)))

Load preprocessed dataset (preprocessing code in 19_10_14_data_preprocessing.Rmd) and clustering (pipeline in 19_10_21_WGCNA.Rmd)

# Gandal dataset
load('./../Data/preprocessed_data.RData')
datExpr = datExpr %>% data.frame
DE_info = DE_info %>% data.frame


# GO Neuronal annotations: regex 'neuron' in GO functional annotations and label the genes that make a match as neuronal
GO_annotations = read.csv('./../Data/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>% 
              mutate('ID'=as.character(ensembl_gene_id)) %>% 
              dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
              mutate('Neuronal'=1)


# SFARI Genes
SFARI_genes = read_csv('./../../../SFARI/Data/SFARI_genes_08-29-2019_w_ensembl_IDs.csv')
SFARI_genes = SFARI_genes[!duplicated(SFARI_genes$ID) & !is.na(SFARI_genes$ID),]


# Clusterings
clusterings = read_csv('./../Data/clusters.csv')


# Update DE_info with SFARI and Neuronal information
genes_info = DE_info %>% mutate('ID'=rownames(.)) %>% left_join(SFARI_genes, by='ID') %>% 
  mutate(`gene-score`=ifelse(is.na(`gene-score`), 'None', `gene-score`)) %>%
  left_join(GO_neuronal, by='ID') %>% left_join(clusterings, by='ID') %>%
  mutate(Neuronal=ifelse(is.na(Neuronal), 0, Neuronal)) %>%
  mutate(gene.score=ifelse(`gene-score`=='None' & Neuronal==1, 'Neuronal', `gene-score`), 
         significant=padj<0.05 & !is.na(padj))


clustering_selected = 'DynamicHybrid'
genes_info$Module = genes_info[,clustering_selected]

dataset = read.csv(paste0('./../Data/dataset_', clustering_selected, '.csv'))
dataset$Module = dataset[,clustering_selected]

# Correct SFARI Scores
dataset$gene.score = genes_info$gene.score

# Enrichment Analysis
load('./../Data/enrichmentAnalysis.RData')


SFARI_colour_hue = function(r) {
  pal = c('#FF7631','#FFB100','#E8E328','#8CC83F','#62CCA6','#59B9C9','#b3b3b3','#808080','gray','#d9d9d9')[r]
}

rm(DE_info, GO_annotations, clusterings, getinfo, mart, dds, GO_neuronal)

Calculate the Enrichment of SFARI Genes in Modules


To measure the presence of SFARI Genes in a module, at first we were just measuring the percentage of genes in the module that belonge to SFARI, but this approach does not take into account the size of the module, and because of this it doesn’t take into consideration the robustness of the results (it’s easier to get a high percentage of SFARI genes in a small module by chance than in a larger module). Because of this, we chose to use the following approach:

If we assume independence between SFARI Genes and modules, we can calculate the probability of obtaining a proportion of SFARI Genes at least as big as the one found in each module given its size


Notation:


If we interpret the number of genes (\(n\)) in a module as \(n\) random draws without replacement from a finite population of size \(N\), and the number of SFARI genes in the module (\(k\)) as \(k\) successes in those \(n\) draws, where we know that \(N\) contains exactly \(K\) successes, then we can use the Hypergeometric Distribution to calculate the statistical significance of having drawn \(k\) successes out of \(n\) draws, and use this value to select the clusters with the highest enrichment



Relation between SFARI Enrichment and Module-Diagnosis correlation


  • For Modules with negative correlation to ASD there doesn’t seem to be a relation between this correlation and the enrichment of the SFARI Genes of the module, but for Modules with positive correlation, the higher the correlation, the smaller the enrichment of SFARI Genes

  • SFARI genes seem to be strongly concentrated in a group of modules and almost abstent in others (many modules with a probability close to 1 and many with a probability close to 0, with not so many in between)

  • The size of the modules does play a part in the probabilities, with the smallest modules having on average less extreme probabilities than the rest of the modules

SFARI_genes_by_module = dataset %>% mutate('hasSFARIscore' = !gene.score %in% c('None', 'Neuronal')) %>% 
                        group_by(Module, MTcor, hasSFARIscore) %>% summarise(s=n()) %>% 
                        left_join(dataset %>% group_by(Module) %>% summarise(n=n()), by='Module') %>% 
                        mutate(perc=round(s/n*100,2)) %>% filter(hasSFARIscore & Module != 'gray') %>% arrange(desc(perc)) %>% ungroup

N = sum(SFARI_genes_by_module$n)
S = sum(SFARI_genes_by_module$s)

calc_prob = function(row, log.p, S){
  s = row[4] %>% as.numeric
  n = row[5] %>% as.numeric
  prob = phyper(s, S, N-S, n, log.p = log.p, lower.tail=FALSE)
  return(prob)
}

SFARI_genes_by_module$prob = apply(SFARI_genes_by_module, 1, function(x) calc_prob(x, FALSE, S))
SFARI_genes_by_module$adj_prob = p.adjust(SFARI_genes_by_module$prob, method = 'bonferroni')
SFARI_genes_by_module = SFARI_genes_by_module %>% arrange(prob)

ggplotly(SFARI_genes_by_module %>% ggplot(aes(MTcor, prob, size=n)) + geom_point(color=SFARI_genes_by_module$Module, alpha=0.5, aes(id=Module)) + 
         geom_smooth(color='#cccccc', size = 0.5, se=FALSE) + xlab('Module-Diagnosis Correlation') + ylab('Probability') + 
         ggtitle(paste0('Corr = ', round(cor(SFARI_genes_by_module$MTcor, SFARI_genes_by_module$prob),2), ': Corr[Module-ASD corr<0] = ', 
                        round(cor(SFARI_genes_by_module$MTcor[SFARI_genes_by_module$MTcor<0], SFARI_genes_by_module$prob[SFARI_genes_by_module$MTcor<0]),3),
                        ' Corr[Module-ASD corr>0] = ',
                        round(cor(SFARI_genes_by_module$MTcor[SFARI_genes_by_module$MTcor>=0], SFARI_genes_by_module$prob[SFARI_genes_by_module$MTcor>=0]),2))) +
         theme_minimal() + theme(legend.position = 'none'))

It’s weird that the Module with the highest (positive) correlation to ASD have less enrichment in SFARI Genes than the rest of the Modules. This seems to be because even though SFARI Genes are quite balanced between over-and under-expressed gemes, they have lower lFC values than the rest of the genes in the over-expressed group

# !diagnostics off
plot_data = genes_info %>% mutate(label = ifelse(!gene.score %in% c('Neuronal', 'None'), 'SFARI', gene.score)) %>% dplyr::select(label, log2FoldChange) %>%
            mutate(class = factor(label, levels = c('None', 'Neuronal', 'SFARI')),
                   quant = cut(log2FoldChange, breaks = quantile(log2FoldChange, probs = seq(0,1,0.05)) %>% as.vector, labels = FALSE)) %>% 
            filter(label == 'SFARI') %>% group_by(quant) %>% tally %>% ungroup

ggplotly(genes_info %>% mutate(direction = factor(ifelse(log2FoldChange<0, 'under-expressed', 'over-expressed'), levels = c('under-expressed', 'over-expressed')),
                               label = factor(ifelse(!gene.score %in% c('Neuronal', 'None'), 'SFARI', gene.score), levels = c('None', 'Neuronal', 'SFARI'))) %>% 
         ggplot(aes(x=direction, fill = label)) + geom_bar(position = 'fill') + ggtitle('Proportion of SFARI Genes for under- and over-expressed genes') +
         scale_fill_manual(values = c('#b3b3b3','#808080','#ff6600')) + ylab('Proportion') + xlab('Direction') + scale_y_sqrt() + theme_minimal())
ggplotly(plot_data %>% ggplot(aes(quant, n)) + geom_bar(stat = 'identity', fill = '#ff6600') + geom_smooth(color = 'gray', alpha = 0.3) + 
         xlab('Log Fold Change Quantiles') + ylab('Number of SFARI Genes') + ggtitle('Number of SFARI Genes for lFC Quantiles') + theme_minimal())

If we separate the SFARI Genes by score we find the same pattern for each score, although much noisier

calc_prob_by_SFARI_score = function(score){
  
  df = dataset %>% filter(gene.score == score) %>% group_by(Module, gene.score) %>% summarise(s = n()) %>%
       filter(Module != 'gray') %>% ungroup %>% right_join(SFARI_genes_by_module %>% dplyr::select(Module, MTcor, n), by = 'Module') %>%
       mutate(gene.score = '1', color = SFARI_colour_hue(as.numeric(score)), s = ifelse(is.na(s), 0, s)) %>% dplyr::select(Module, MTcor, gene.score, s, n, color)
  
  df$prob = apply(df, 1, function(x) calc_prob(x, FALSE, sum(dataset$gene.score == score)))
  df$adj_prob = p.adjust(df$prob, method = 'bonferroni')
  
  return(df)
}


plot_data = c()

for(score in names(table(dataset$gene.score[!dataset$gene.score %in% c('Neuronal','None', 'Others')]))){
  score_info = calc_prob_by_SFARI_score(score)
  plot_data = rbind(plot_data, score_info)
}


ggplotly(plot_data %>% ggplot(aes(MTcor, prob, size=n, color = color)) + geom_point(alpha=0.5, aes(id=Module)) + geom_smooth(size = 0.5, se=FALSE) +
         xlab('Module-Diagnosis Correlation') + scale_colour_manual(values = SFARI_colour_hue(r=1:6)) + ylab('Probability') + 
         ggtitle('Enrichment by SFARI Gene Score') + theme_minimal() + theme(legend.position = 'none'))



Selecting Top Modules Enriched in SFARI Genes


We can interpet the probability we obtain as a p-value (and correct it for multiple testing), we can use it as a threshold to identify modules with a significantly high percentage of SFARI genes (adjusted p-value < 0.01)

Using log-scale to help us differentiate between small differences close to zero better

ggplotly(SFARI_genes_by_module %>% ggplot(aes(MTcor, adj_prob, size=n)) + geom_point(color=SFARI_genes_by_module$Module, alpha=0.5, aes(id=Module)) +
         geom_hline(yintercept = 0.05, color = 'gray', linetype = 'dotted') + xlab('Module-Diagnosis Correlation') + ylab('Corrected p-values') + scale_y_log10() +
         theme_minimal() + theme(legend.position = 'none'))
top_modules = SFARI_genes_by_module$Module[SFARI_genes_by_module$adj_prob <0.05]

cat(paste0('Keeping top ', length(top_modules),' modules: ', paste(top_modules, collapse = ', ')))
## Keeping top 6 modules: #B79F00, #00A8FF, #FE61CF, #619CFF, #00B9E3, #00BECF
rm(N,S,calc_prob)

Exploratory Analysis of Top Modules


PCA


The genes belonging to the modules enriched in SARI genes seem to be distributed in all of the PC space except for the highest values of PC2, which we know correspond to over-expressed genes. This agrees with the positive slope at the end of the trend line in the plot above

pca = datExpr %>% prcomp

plot_data = data.frame('ID'=rownames(datExpr), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2]) %>% left_join(dataset, by='ID') %>% 
            dplyr::select(ID, PC1, PC2, Module, gene.score) %>% mutate(ImportantModules = ifelse(Module %in% top_modules, as.character(Module), 'Others')) %>%
            mutate(color = ifelse(ImportantModules=='Others','gray',ImportantModules), alpha = ifelse(ImportantModules=='Others', 0.1, 0.3))

table(plot_data$ImportantModules)
## 
## #00A8FF #00B9E3 #00BECF #619CFF #B79F00 #FE61CF  Others 
##     270    1733     291     459     174     315   12905
p = plot_data %>% ggplot(aes(PC1, PC2, color=ImportantModules)) + geom_point(alpha=plot_data$alpha) + 
    scale_colour_manual(values =  c(names(table(plot_data$ImportantModules))[-(length(top_modules)+1)],'gray')) + 
    theme_minimal() + theme(legend.position = 'none') + ggtitle('Modules with the most significant presence of SFARI Genes')

for(tm in top_modules){
  p = p + geom_hline(yintercept = mean(plot_data$PC2[plot_data$Module==tm]), color = tm, linetype = 'dashed') + 
          geom_vline(xintercept = mean(plot_data$PC1[plot_data$Module==tm]), color = tm, linetype = 'dashed')
}

ggExtra::ggMarginal(p, type='density', groupColour = TRUE, size=10)


Relation between gene’s Module Membership and Gene Significance


create_plot = function(module){
  
  plot_data = dataset %>% dplyr::select(ID, paste0('MM.',gsub('#','',module)), GS, gene.score) %>% filter(dataset$Module==module)
  colnames(plot_data)[2] = 'Module'
  
  SFARI_colors = as.numeric(names(table(as.character(plot_data$gene.score)[plot_data$gene.score!='None'])))
  SFARI_colors = SFARI_colors[!is.na(SFARI_colors)]
  
  p = ggplotly(plot_data %>% ggplot(aes(Module, GS, color=gene.score)) + geom_point(alpha=0.5, aes(ID=ID)) + ylab('Gene Significance') +
               scale_color_manual(values=SFARI_colour_hue(r=c(SFARI_colors,8,7))) + theme_minimal() + xlab('Module Membership') +
               ggtitle(paste0('Module ', module,'  (MTcor = ', round(moduleTraitCor[paste0('ME',module)][[1]],2),')')))
  
  return(p)
}

datTraits = datMeta %>% dplyr::select(Diagnosis)

# Recalculate MEs with color labels
ME_object = datExpr %>% t %>% moduleEigengenes(colors = genes_info$Module)
MEs = orderMEs(ME_object$eigengenes)

# Calculate correlation between eigengenes and the traits and their p-values
moduleTraitCor = MEs %>% apply(2, function(x) hetcor(x, datTraits)$correlations[1,-1]) %>% t
names(moduleTraitCor) = colnames(MEs)

create_plot(top_modules[1])
create_plot(top_modules[2])
create_plot(top_modules[3])
create_plot(top_modules[4])
create_plot(top_modules[5])
create_plot(top_modules[6])
rm(datTraits, ME_object, MEs, create_plot)


Functional Enrichment Analysis

i = 1
kable(enrichment$enrichmentTable %>% filter(class==top_modules[i]) %>% 
      dplyr::select(dataSetID, shortDataSetName, inGroups, Bonferroni, FDR, enrichmentRatio, effectiveClassSize, effectiveSetSize, nCommonGenes) %>%
      arrange(Bonferroni, desc(enrichmentRatio)),
      caption = paste0('Enriched terms for module ', top_modules[i], ' (SFARI Genes = ',
                       round(SFARI_genes_by_module$perc[SFARI_genes_by_module$Module==top_modules[i]][1],4), '%)'))
Enriched terms for module #B79F00 (SFARI Genes = 15.52%)
dataSetID shortDataSetName inGroups Bonferroni FDR enrichmentRatio effectiveClassSize effectiveSetSize nCommonGenes
JAMiller.AIBS.000569 WGCNA humanSpecificOlivedrab2Module frontalCtx FOXP2 JA Miller at AIBS|Brain|Postnatal brain|Cortex|WGCNA 0.0008823 0.0000011 1.850606 168 4036 80
GO:0051252 regulation of RNA metabolic process GO|GO.BP 0.2041583 0.0001640 1.881450 168 3027 61
GO:0090304 nucleic acid metabolic process GO|GO.BP 0.2908464 0.0002255 1.670297 168 4304 77
GO:2000112 regulation of cellular macromolecule biosynthetic process GO|GO.BP 0.2970930 0.0002299 1.847594 168 3133 62
GO:0019219 regulation of nucleobase-containing compound metabolic process GO|GO.BP 0.3413090 0.0002605 1.812876 168 3296 64
GO:0010556 regulation of macromolecule biosynthetic process GO|GO.BP 1.0000000 0.0007780 1.778891 168 3254 62
GO:0031326 regulation of cellular biosynthetic process GO|GO.BP 1.0000000 0.0007603 1.754842 168 3405 64
GO:0009889 regulation of biosynthetic process GO|GO.BP 1.0000000 0.0006901 1.748373 168 3471 65
GO:0016070 RNA metabolic process GO|GO.BP 1.0000000 0.0010016 1.683317 168 3827 69
GO:0005634 nucleus GO|GO.CC 1.0000000 0.0013611 1.475657 168 5884 93
i = 2
kable(enrichment$enrichmentTable %>% filter(class==top_modules[i]) %>% 
      dplyr::select(dataSetID, shortDataSetName, inGroups, Bonferroni, FDR, enrichmentRatio, effectiveClassSize, effectiveSetSize, nCommonGenes) %>%
      arrange(Bonferroni, desc(enrichmentRatio)),
      caption = paste0('Enriched terms for module ', top_modules[i], ' (SFARI Genes = ',
                       round(SFARI_genes_by_module$perc[SFARI_genes_by_module$Module==top_modules[i]][1],4), '%)'))
Enriched terms for module #00A8FF (SFARI Genes = 12.96%)
dataSetID shortDataSetName inGroups Bonferroni FDR enrichmentRatio effectiveClassSize effectiveSetSize nCommonGenes
JAMiller.AIBS.000188 CortexWGCNA midfetal M18 JA Miller at AIBS|Brain|Prenatal brain|WGCNA 0.0007389 0.0000010 3.949958 261 426 28
JAMiller.AIBS.000349 Genes bound by KDM5B in MOUSE MESC from PMID 21448134 JA Miller at AIBS|Chip-X enrichment analysis (ChEA) 0.0017535 0.0000021 1.824771 261 2964 90
JAMiller.AIBS.000161 Genes increasing with age in fetal PFC JA Miller at AIBS|Brain|Prenatal brain|Age-associated genes|Cortex 1.0000000 0.0229150 2.258634 261 745 28
JAMiller.AIBS.000078 Cerebellar granule cells JA Miller at AIBS|Brain|Postnatal brain|Cell type markers 1.0000000 0.0197433 1.827103 261 1513 46
GO:2000112 regulation of cellular macromolecule biosynthetic process GO|GO.BP 1.0000000 0.0207200 1.515342 261 3133 79
GO:0010556 regulation of macromolecule biosynthetic process GO|GO.BP 1.0000000 0.0150683 1.514399 261 3254 82
JAMiller.AIBS.000569 WGCNA humanSpecificOlivedrab2Module frontalCtx FOXP2 JA Miller at AIBS|Brain|Postnatal brain|Cortex|WGCNA 1.0000000 0.0030512 1.488994 261 4036 100
GO:0009889 regulation of biosynthetic process GO|GO.BP 1.0000000 0.0260383 1.471663 261 3471 85
JAMiller.AIBS.000268 Genes bound by EGR1 in HUMAN ERYTHROLEUKEMIA from PMID 20690147 JA Miller at AIBS|Chip-X enrichment analysis (ChEA) 1.0000000 0.0269462 1.369208 261 4828 110
GO:0050789 regulation of biological process GO|GO.BP 1.0000000 0.0224722 1.207686 261 8957 180
i = 3
kable(enrichment$enrichmentTable %>% filter(class==top_modules[i]) %>% 
      dplyr::select(dataSetID, shortDataSetName, inGroups, Bonferroni, FDR, enrichmentRatio, effectiveClassSize, effectiveSetSize, nCommonGenes) %>%
      arrange(Bonferroni, desc(enrichmentRatio)),
      caption = paste0('Enriched terms for module ', top_modules[i], ' (SFARI Genes = ',
                       round(SFARI_genes_by_module$perc[SFARI_genes_by_module$Module==top_modules[i]][1],4), '%)'))
Enriched terms for module #FE61CF (SFARI Genes = 12.06%)
dataSetID shortDataSetName inGroups Bonferroni FDR enrichmentRatio effectiveClassSize effectiveSetSize nCommonGenes
JAMiller.AIBS.000188 CortexWGCNA midfetal M18 JA Miller at AIBS|Brain|Prenatal brain|WGCNA 0.0000000 0.0000000 7.058003 313 426 60
JAMiller.AIBS.000195 RegionalWGCNA midfetal M25 JA Miller at AIBS|Brain|Prenatal brain|WGCNA 0.0000000 0.0000000 8.176139 313 190 31
JAMiller.AIBS.000048 CortexWGCNA 15-21 post-conception weeks C22 CPenriched enrichedForAutismGenes JA Miller at AIBS|Brain|Prenatal brain|Cortex|WGCNA 0.0000000 0.0000000 3.962714 313 607 48
JAMiller.AIBS.000044 CortexWGCNA 15-21 post-conception weeks C18 JA Miller at AIBS|Brain|Prenatal brain|Cortex|WGCNA 0.0000097 0.0000000 3.140713 313 718 45
JAMiller.AIBS.000268 Genes bound by EGR1 in HUMAN ERYTHROLEUKEMIA from PMID 20690147 JA Miller at AIBS|Chip-X enrichment analysis (ChEA) 0.0001385 0.0000002 1.556912 313 4828 150
JAMiller.AIBS.000155 Lowest in VZ of E14.5 mouse JA Miller at AIBS|Brain|Prenatal brain|Markers of cortex layers|Cortex 0.0004462 0.0000006 2.138185 313 1664 71
JAMiller.AIBS.000569 WGCNA humanSpecificOlivedrab2Module frontalCtx FOXP2 JA Miller at AIBS|Brain|Postnatal brain|Cortex|WGCNA 0.0006891 0.0000009 1.614107 313 4036 130
JAMiller.AIBS.000257 Genes bound by DMRT1 in MOUSE TESTES from PMID 23473982 JA Miller at AIBS|Chip-X enrichment analysis (ChEA) 0.0747682 0.0000662 1.969328 313 1654 65
JAMiller.AIBS.000245 Genes bound by CREB1 in RAT HIPPOCAMPUS from PMID 23762244 JA Miller at AIBS|Chip-X enrichment analysis (ChEA) 0.1714689 0.0001401 1.823144 313 2034 74
JAMiller.AIBS.000124 HippocampusWGCNA yellow JA Miller at AIBS|Brain|Postnatal brain|WGCNA 0.3129563 0.0002413 2.590714 313 677 35
i = 4
kable(enrichment$enrichmentTable %>% filter(class==top_modules[i]) %>% 
      dplyr::select(dataSetID, shortDataSetName, inGroups, Bonferroni, FDR, enrichmentRatio, effectiveClassSize, effectiveSetSize, nCommonGenes) %>%
      arrange(Bonferroni, desc(enrichmentRatio)),
      caption = paste0('Enriched terms for module ', top_modules[i], ' (SFARI Genes = ',
                       round(SFARI_genes_by_module$perc[SFARI_genes_by_module$Module==top_modules[i]][1],4), '%)'))
Enriched terms for module #619CFF (SFARI Genes = 9.59%)
dataSetID shortDataSetName inGroups Bonferroni FDR enrichmentRatio effectiveClassSize effectiveSetSize nCommonGenes
GO:0090304 nucleic acid metabolic process GO|GO.BP 0.00e+00 0e+00 1.753458 451 4304 217
GO:0016070 RNA metabolic process GO|GO.BP 0.00e+00 0e+00 1.817521 451 3827 200
GO:0010467 gene expression GO|GO.BP 0.00e+00 0e+00 1.678789 451 4454 215
GO:0005634 nucleus GO|GO.CC 0.00e+00 0e+00 1.536769 451 5884 260
GO:0003676 nucleic acid binding GO|GO.MF 0.00e+00 0e+00 1.839548 451 3214 170
GO:0006139 nucleobase-containing compound metabolic process GO|GO.BP 0.00e+00 0e+00 1.610303 451 4881 226
GO:0046483 heterocycle metabolic process GO|GO.BP 0.00e+00 0e+00 1.578302 451 5002 227
GO:0006725 cellular aromatic compound metabolic process GO|GO.BP 0.00e+00 0e+00 1.567335 451 5037 227
GO:0051252 regulation of RNA metabolic process GO|GO.BP 0.00e+00 0e+00 1.826807 451 3027 159
GO:0031981 nuclear lumen GO|GO.CC 0.00e+00 0e+00 1.709029 451 3724 183
GO:0019219 regulation of nucleobase-containing compound metabolic process GO|GO.BP 0.00e+00 0e+00 1.772679 451 3296 168
JAMiller.AIBS.000349 Genes bound by KDM5B in MOUSE MESC from PMID 21448134 JA Miller at AIBS|Chip-X enrichment analysis (ChEA) 0.00e+00 0e+00 1.830435 451 2964 156
GO:0034641 cellular nitrogen compound metabolic process GO|GO.BP 0.00e+00 0e+00 1.507378 451 5445 236
GO:0044428 nuclear part GO|GO.CC 0.00e+00 0e+00 1.637542 451 4014 189
GO:1901360 organic cyclic compound metabolic process GO|GO.BP 0.00e+00 0e+00 1.518497 451 5199 227
GO:0010468 regulation of gene expression GO|GO.BP 0.00e+00 0e+00 1.683595 451 3615 175
GO:0005654 nucleoplasm GO|GO.CC 2.00e-07 0e+00 1.723000 451 3169 157
GO:0006355 regulation of transcription, DNA-templated GO|GO.BP 3.00e-07 0e+00 1.785435 451 2766 142
GO:0006351 transcription, DNA-templated GO|GO.BP 8.00e-07 0e+00 1.740690 451 2937 147
GO:1903506 regulation of nucleic acid-templated transcription GO|GO.BP 9.00e-07 0e+00 1.759835 451 2826 143
GO:0032774 RNA biosynthetic process GO|GO.BP 9.00e-07 0e+00 1.729050 451 2997 149
GO:2001141 regulation of RNA biosynthetic process GO|GO.BP 1.10e-06 0e+00 1.754248 451 2835 143
JAMiller.AIBS.000209 RegionalWGCNA midfetal M39 AlternativeSplicing JA Miller at AIBS|Brain|Prenatal brain|WGCNA 1.20e-06 0e+00 3.058089 451 580 51
GO:0097659 nucleic acid-templated transcription GO|GO.BP 2.80e-06 0e+00 1.714422 451 2982 147
GO:2000112 regulation of cellular macromolecule biosynthetic process GO|GO.BP 3.50e-06 0e+00 1.687296 451 3133 152
GO:0031326 regulation of cellular biosynthetic process GO|GO.BP 4.60e-06 0e+00 1.644435 451 3405 161
GO:1901363 heterocyclic compound binding GO|GO.MF 8.00e-06 0e+00 1.489880 451 4832 207
GO:0003677 DNA binding GO|GO.MF 1.12e-05 0e+00 1.947733 451 1857 104
GO:0009889 regulation of biosynthetic process GO|GO.BP 1.12e-05 0e+00 1.623186 451 3471 162
GO:0010556 regulation of macromolecule biosynthetic process GO|GO.BP 1.72e-05 0e+00 1.645929 451 3254 154
GO:0097159 organic cyclic compound binding GO|GO.MF 2.51e-05 0e+00 1.473414 451 4886 207
GO:0034654 nucleobase-containing compound biosynthetic process GO|GO.BP 2.51e-05 0e+00 1.611310 451 3475 161
GO:0060255 regulation of macromolecule metabolic process GO|GO.BP 2.73e-05 0e+00 1.469703 451 4922 208
GO:0051171 regulation of nitrogen compound metabolic process GO|GO.BP 3.59e-05 1e-07 1.485616 451 4682 200
GO:0044451 nucleoplasm part GO|GO.CC 3.83e-05 1e-07 2.388518 451 961 66
GO:0043170 macromolecule metabolic process GO|GO.BP 3.99e-05 1e-07 1.315338 451 7562 286
GO:0018130 heterocycle biosynthetic process GO|GO.BP 4.45e-05 1e-07 1.596509 451 3529 162
GO:0019438 aromatic compound biosynthetic process GO|GO.BP 5.00e-05 1e-07 1.594250 451 3534 162
i = 5
kable(enrichment$enrichmentTable %>% filter(class==top_modules[i]) %>% 
      dplyr::select(dataSetID, shortDataSetName, inGroups, Bonferroni, FDR, enrichmentRatio, effectiveClassSize, effectiveSetSize, nCommonGenes) %>%
      arrange(Bonferroni, desc(enrichmentRatio)),
      caption = paste0('Enriched terms for module ', top_modules[i], ' (SFARI Genes = ',
                       round(SFARI_genes_by_module$perc[SFARI_genes_by_module$Module==top_modules[i]][1],4), '%)'))
Enriched terms for module #00B9E3 (SFARI Genes = 7.33%)
dataSetID shortDataSetName inGroups Bonferroni FDR enrichmentRatio effectiveClassSize effectiveSetSize nCommonGenes
JAMiller.AIBS.000052 CortexWGCNA 15-21 post-conception weeks C26 JA Miller at AIBS|Brain|Prenatal brain|Cortex|WGCNA 0.00e+00 0e+00 2.708083 1695 721 211
JAMiller.AIBS.000142 Highest in CP of 13-16 post-conception weeks human JA Miller at AIBS|Brain|Prenatal brain|Markers of cortex layers|Cortex|Postmitotic brain 0.00e+00 0e+00 2.194883 1695 1210 287
JAMiller.AIBS.000569 WGCNA humanSpecificOlivedrab2Module frontalCtx FOXP2 JA Miller at AIBS|Brain|Postnatal brain|Cortex|WGCNA 0.00e+00 0e+00 1.524703 1695 4036 665
JAM:002769 downAD_mitochondrion JAM|BrainLists|BrainLists.Blalock_AD 0.00e+00 0e+00 3.364977 1695 264 96
JAMiller.AIBS.000150 Highest in CP of E14.5 mouse JA Miller at AIBS|Brain|Prenatal brain|Markers of cortex layers|Cortex|Postmitotic brain 0.00e+00 0e+00 1.878513 1695 1266 257
JAM:003016 downAD_synapticTransmission JAM|BrainLists|BrainLists.Blalock_AD 0.00e+00 0e+00 5.152621 1695 88 49
JAMiller.AIBS.000155 Lowest in VZ of E14.5 mouse JA Miller at AIBS|Brain|Prenatal brain|Markers of cortex layers|Cortex 0.00e+00 0e+00 1.723944 1695 1664 310
JAMiller.AIBS.000123 HippocampusWGCNA turquoise DGenriched upAge JA Miller at AIBS|Brain|Postnatal brain|WGCNA 0.00e+00 0e+00 1.921531 1695 1098 228
JAMiller.AIBS.000570 WGCNA Olivedrab2ModuleGenes with enriched ELAVL2 targets JA Miller at AIBS|Brain|Postnatal brain|Cortex|WGCNA 0.00e+00 0e+00 2.481758 1695 481 129
JAM:003072 Tail of Caudate Nucleus_IN_Striatum JAM|BrainRegionMarkers|BrainRegionMarkers.HBA|BrainRegionMarkers.HBA.localMarker(top200)|Brain region markers|Brain region marker enriched gene sets 0.00e+00 0e+00 3.506055 1695 161 61
GO:0044456 synapse part GO|GO.CC 0.00e+00 0e+00 1.967674 1695 823 175
GO:0045202 synapse GO|GO.CC 0.00e+00 0e+00 1.834783 1695 1044 207
GO:0097060 synaptic membrane GO|GO.CC 0.00e+00 0e+00 2.418297 1695 375 98
JAMiller.AIBS.000141 CP enriched in E14.5 mouse JA Miller at AIBS|Brain|Prenatal brain|Markers of cortex layers|Cortex|Postmitotic brain 0.00e+00 0e+00 2.112789 1695 565 129
GO:0097458 neuron part GO|GO.CC 0.00e+00 0e+00 1.651046 1695 1418 253
JAM:002764 downAging_mitochondria_synapse JAM|BrainLists|BrainLists.Lu_Aging 0.00e+00 0e+00 2.349013 1695 390 99
JAM:002751 Basal Pons JAM|BrainRegionMarkers|BrainRegionMarkers.HBA|BrainRegionMarkers.HBA.globalMarker(top200)|Brain region markers|Brain region marker enriched gene sets 0.00e+00 0e+00 3.196728 1695 165 57
JAM:002744 Autism_differential_expression_across_at_least_one_comparison JAM|BrainLists|BrainLists.Voineagu 0.00e+00 0e+00 1.855589 1695 763 153
JAM:003054 subiculum_IN_Hippocampal Formation JAM|BrainRegionMarkers|BrainRegionMarkers.HBA|BrainRegionMarkers.HBA.localMarker(top200)|Brain region markers|Brain region marker enriched gene sets 0.00e+00 0e+00 3.065639 1695 163 54
JAMiller.AIBS.000005 CPi markers at 21 post-conception weeks JA Miller at AIBS|Brain|Prenatal brain|Cortex|Markers of cortex layers|Postmitotic brain 1.00e-07 0e+00 2.395777 1695 309 80
JAM:002805 Cerebral Cortex JAM|BrainRegionMarkers|BrainRegionMarkers.HBA|BrainRegionMarkers.HBA.globalMarker(top200)|Brain region markers|Brain region marker enriched gene sets 2.00e-07 0e+00 2.988769 1695 161 52
JAM:002739 arcuate nucleus of medulla_IN_Myelencephalon JAM|BrainRegionMarkers|BrainRegionMarkers.HBA|BrainRegionMarkers.HBA.localMarker(top200)|Brain region markers|Brain region marker enriched gene sets 2.00e-07 0e+00 2.936799 1695 167 53
JAM:002920 Lateral Nucleus_IN_Amygdala JAM|BrainRegionMarkers|BrainRegionMarkers.HBA|BrainRegionMarkers.HBA.localMarker(top200)|Brain region markers|Brain region marker enriched gene sets 5.00e-07 0e+00 2.916314 1695 165 52
GO:0099536 synaptic signaling GO|GO.BP 5.00e-07 0e+00 1.949884 1695 560 118
GO:0034220 ion transmembrane transport GO|GO.BP 7.00e-07 0e+00 1.720897 1695 898 167
GO:0098793 presynapse GO|GO.CC 1.00e-06 0e+00 2.108154 1695 417 95
JAMiller.AIBS.000095 Cortical PNOC neurons JA Miller at AIBS|Brain|Postnatal brain|Cell type markers|Cortex 2.50e-06 0e+00 1.282364 1695 3940 546
GO:0045211 postsynaptic membrane GO|GO.CC 3.10e-06 0e+00 2.346005 1695 284 72
GO:0099537 trans-synaptic signaling GO|GO.BP 3.40e-06 0e+00 1.917431 1695 555 115
JAM:002882 Hippocampal Formation JAM|BrainRegionMarkers|BrainRegionMarkers.HBA|BrainRegionMarkers.HBA.globalMarker(top200)|Brain region markers|Brain region marker enriched gene sets 4.00e-06 0e+00 2.809155 1695 168 51
GO:0007268 chemical synaptic transmission GO|GO.BP 4.10e-06 0e+00 1.918037 1695 550 114
GO:0098916 anterograde trans-synaptic signaling GO|GO.BP 4.10e-06 0e+00 1.918037 1695 550 114
JAMiller.AIBS.000463 Genes bound by SMAD4 in HUMAN A2780 from PMID 21799915 JA Miller at AIBS|Chip-X enrichment analysis (ChEA) 9.70e-06 0e+00 1.415427 1695 2079 318
GO:0043005 neuron projection GO|GO.CC 1.07e-05 0e+00 1.625648 1695 1036 182
GO:0030424 axon GO|GO.CC 1.10e-05 0e+00 1.942358 1695 505 106
GO:0022836 gated channel activity GO|GO.MF 2.52e-05 0e+00 2.420796 1695 237 62
JAMiller.AIBS.000042 CortexWGCNA 15-21 post-conception weeks C16 SPenriched JA Miller at AIBS|Brain|Prenatal brain|Cortex|WGCNA 2.81e-05 0e+00 2.570469 1695 198 55
JAM:002824 Dentate Nucleus_IN_Cerebellar Nucleus JAM|BrainRegionMarkers|BrainRegionMarkers.HBA|BrainRegionMarkers.HBA.localMarker(top200)|Brain region markers|Brain region marker enriched gene sets 5.67e-05 1e-07 2.725012 1695 163 48
GO:0034702 ion channel complex GO|GO.CC 5.72e-05 1e-07 2.437355 1695 224 59
GO:0005216 ion channel activity GO|GO.MF 6.05e-05 1e-07 2.256199 1695 283 69
GO:0022839 ion gated channel activity GO|GO.MF 6.28e-05 1e-07 2.391843 1695 236 61
GO:1902495 transmembrane transporter complex GO|GO.CC 7.84e-05 1e-07 2.361023 1695 243 62
GO:0099240 intrinsic component of synaptic membrane GO|GO.CC 8.40e-05 1e-07 2.764088 1695 154 46
i = 6
kable(enrichment$enrichmentTable %>% filter(class==top_modules[i]) %>% 
      dplyr::select(dataSetID, shortDataSetName, inGroups, Bonferroni, FDR, enrichmentRatio, effectiveClassSize, effectiveSetSize, nCommonGenes) %>%
      arrange(Bonferroni, desc(enrichmentRatio)),
      caption = paste0('Enriched terms for module ', top_modules[i], ' (SFARI Genes = ',
                       round(SFARI_genes_by_module$perc[SFARI_genes_by_module$Module==top_modules[i]][1],4), '%)'))
Enriched terms for module #00BECF (SFARI Genes = 9.97%)
dataSetID shortDataSetName inGroups Bonferroni FDR enrichmentRatio effectiveClassSize effectiveSetSize nCommonGenes
JAMiller.AIBS.000349 Genes bound by KDM5B in MOUSE MESC from PMID 21448134 JA Miller at AIBS|Chip-X enrichment analysis (ChEA) 0.0121595 0.0000127 1.745180 282 2964 93
JAMiller.AIBS.000209 RegionalWGCNA midfetal M39 AlternativeSplicing JA Miller at AIBS|Brain|Prenatal brain|WGCNA 0.0761661 0.0000671 2.972823 282 580 31
JAMiller.AIBS.000183 CortexWGCNA midfetal M13 JA Miller at AIBS|Brain|Prenatal brain|WGCNA 0.7377701 0.0005196 2.086836 282 1306 49
GO:0022613 ribonucleoprotein complex biogenesis GO|GO.BP 1.0000000 0.0056972 3.183105 282 332 19
GO:0000375 RNA splicing, via transesterification reactions GO|GO.BP 1.0000000 0.0374287 3.016709 282 295 16
JAMiller.AIBS.000211 RegionalWGCNA midfetal M41 mRNA metabolism JA Miller at AIBS|Brain|Prenatal brain|WGCNA 1.0000000 0.0223577 2.739929 282 406 20
GO:0006397 mRNA processing GO|GO.BP 1.0000000 0.0187850 2.697533 282 433 21
GO:0016071 mRNA metabolic process GO|GO.BP 1.0000000 0.0265071 2.200541 282 733 29
JAMiller.AIBS.000518 Genes bound by TCF4 in HUMAN U87 from PMID 23295773 JA Miller at AIBS|Chip-X enrichment analysis (ChEA) 1.0000000 0.0021888 1.587577 282 3013 86
JAMiller.AIBS.000332 Genes bound by HNF4A in human HepG2 from PMID 19822575 JA Miller at AIBS|Chip-X enrichment analysis (ChEA) 1.0000000 0.0503541 1.319920 282 5141 122



Session info

sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
##  [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] BrainDiseaseCollection_1.00             
##  [2] anRichment_1.01-2                       
##  [3] TxDb.Mmusculus.UCSC.mm10.knownGene_3.4.7
##  [4] TxDb.Hsapiens.UCSC.hg19.knownGene_3.2.2 
##  [5] GenomicFeatures_1.36.4                  
##  [6] GenomicRanges_1.36.1                    
##  [7] GenomeInfoDb_1.20.0                     
##  [8] anRichmentMethods_0.90-1                
##  [9] WGCNA_1.69                              
## [10] fastcluster_1.1.25                      
## [11] dynamicTreeCut_1.63-1                   
## [12] GO.db_3.8.2                             
## [13] AnnotationDbi_1.46.1                    
## [14] IRanges_2.18.3                          
## [15] S4Vectors_0.22.1                        
## [16] Biobase_2.44.0                          
## [17] BiocGenerics_0.30.0                     
## [18] biomaRt_2.40.5                          
## [19] knitr_1.28                              
## [20] doParallel_1.0.15                       
## [21] iterators_1.0.12                        
## [22] foreach_1.5.0                           
## [23] polycor_0.7-10                          
## [24] expss_0.10.2                            
## [25] GGally_1.5.0                            
## [26] gridExtra_2.3                           
## [27] viridis_0.5.1                           
## [28] viridisLite_0.3.0                       
## [29] RColorBrewer_1.1-2                      
## [30] dendextend_1.13.4                       
## [31] plotly_4.9.2                            
## [32] glue_1.3.2                              
## [33] reshape2_1.4.3                          
## [34] forcats_0.5.0                           
## [35] stringr_1.4.0                           
## [36] dplyr_0.8.5                             
## [37] purrr_0.3.3                             
## [38] readr_1.3.1                             
## [39] tidyr_1.0.2                             
## [40] tibble_3.0.0                            
## [41] ggplot2_3.3.0                           
## [42] tidyverse_1.3.0                         
## 
## loaded via a namespace (and not attached):
##   [1] readxl_1.3.1                backports_1.1.5            
##   [3] Hmisc_4.4-0                 plyr_1.8.6                 
##   [5] lazyeval_0.2.2              splines_3.6.3              
##   [7] crosstalk_1.1.0.1           BiocParallel_1.18.1        
##   [9] digest_0.6.25               htmltools_0.4.0            
##  [11] fansi_0.4.1                 magrittr_1.5               
##  [13] checkmate_2.0.0             memoise_1.1.0              
##  [15] cluster_2.1.0               annotate_1.62.0            
##  [17] Biostrings_2.52.0           modelr_0.1.6               
##  [19] matrixStats_0.56.0          prettyunits_1.1.1          
##  [21] jpeg_0.1-8.1                colorspace_1.4-1           
##  [23] blob_1.2.1                  rvest_0.3.5                
##  [25] haven_2.2.0                 xfun_0.12                  
##  [27] crayon_1.3.4                RCurl_1.98-1.1             
##  [29] jsonlite_1.6.1              genefilter_1.66.0          
##  [31] impute_1.58.0               survival_3.1-12            
##  [33] gtable_0.3.0                zlibbioc_1.30.0            
##  [35] XVector_0.24.0              DelayedArray_0.10.0        
##  [37] scales_1.1.0                mvtnorm_1.1-0              
##  [39] DBI_1.1.0                   miniUI_0.1.1.1             
##  [41] Rcpp_1.0.4                  xtable_1.8-4               
##  [43] progress_1.2.2              htmlTable_1.13.3           
##  [45] foreign_0.8-76              bit_1.1-15.2               
##  [47] preprocessCore_1.46.0       Formula_1.2-3              
##  [49] htmlwidgets_1.5.1           httr_1.4.1                 
##  [51] acepack_1.4.1               ellipsis_0.3.0             
##  [53] farver_2.0.3                pkgconfig_2.0.3            
##  [55] reshape_0.8.8               XML_3.99-0.3               
##  [57] nnet_7.3-14                 dbplyr_1.4.2               
##  [59] locfit_1.5-9.4              later_1.0.0                
##  [61] labeling_0.3                tidyselect_1.0.0           
##  [63] rlang_0.4.5                 munsell_0.5.0              
##  [65] cellranger_1.1.0            tools_3.6.3                
##  [67] cli_2.0.2                   generics_0.0.2             
##  [69] RSQLite_2.2.0               broom_0.5.5                
##  [71] fastmap_1.0.1               evaluate_0.14              
##  [73] yaml_2.2.1                  bit64_0.9-7                
##  [75] fs_1.4.0                    nlme_3.1-147               
##  [77] mime_0.9                    ggExtra_0.9                
##  [79] xml2_1.2.5                  compiler_3.6.3             
##  [81] rstudioapi_0.11             png_0.1-7                  
##  [83] reprex_0.3.0                geneplotter_1.62.0         
##  [85] stringi_1.4.6               highr_0.8                  
##  [87] lattice_0.20-41             Matrix_1.2-18              
##  [89] vctrs_0.2.4                 pillar_1.4.3               
##  [91] lifecycle_0.2.0             data.table_1.12.8          
##  [93] bitops_1.0-6                httpuv_1.5.2               
##  [95] rtracklayer_1.44.4          R6_2.4.1                   
##  [97] latticeExtra_0.6-29         promises_1.1.0             
##  [99] codetools_0.2-16            assertthat_0.2.1           
## [101] SummarizedExperiment_1.14.1 DESeq2_1.24.0              
## [103] withr_2.1.2                 GenomicAlignments_1.20.1   
## [105] Rsamtools_2.0.3             GenomeInfoDbData_1.2.1     
## [107] mgcv_1.8-31                 hms_0.5.3                  
## [109] grid_3.6.3                  rpart_4.1-15               
## [111] rmarkdown_2.1               shiny_1.4.0.2              
## [113] lubridate_1.7.4             base64enc_0.1-3